In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as datetime
from impala.util import as_pandas
from impala.dbapi import connect
plt.style.use('ggplot')
%matplotlib notebook
In [ ]:
conn = connect(host="mycluster.domain.com", port=my_impala_port_number)
cur = conn.cursor()
cur.execute("use my_db")
In [ ]:
query_rrt_counts = """
SELECT enc.encntr_id, COUNT(1) AS count
FROM encounter enc
INNER JOIN clinical_event ce
ON enc.encntr_id = ce.encntr_id
WHERE enc.loc_facility_cd='633867'
AND enc.encntr_complete_dt_tm < 4e12
AND ce.event_cd='54411998'
AND ce.result_status_cd NOT IN ('31', '36')
AND ce.valid_until_dt_tm > 4e12
AND ce.event_class_cd not in ('654645')
AND enc.admit_type_cd != '0'
AND enc.encntr_type_class_cd = '391'
GROUP BY enc.encntr_id
ORDER BY enc.encntr_id;
"""
In [ ]:
cur.execute(query_rrt_counts)
df = as_pandas(cur)
In [ ]:
df.head()
In [ ]:
# Number of encounters with multiple RRTs:
len(df[df['count']>1])
In [ ]:
# to get just the encounter ids with counts >1
encs = df[df['count']>1]['encntr_id'].get_values()
In [ ]:
encs
In [ ]:
len(df)
In [ ]:
df.sort_values(by='count', ascending=False).head(10)
In [ ]:
df[df['count']>1].hist()
In [ ]:
# Print out the data related to the encounter with multiple RRT events, if the duration beteween RRT events is less than 1 hour,
# or if the start & end time of the encounter doesn't make sense related to the time of the RRT event.
count = 0
for enc in encs:
count+=1
print "count: {0}".format(count)
query = """
SELECT
ce.encntr_id
, ce.event_id
, ce.valid_until_dt_tm
, from_unixtime(CAST(ce.event_end_dt_tm / 1000 as bigint)) AS event_end
, ce.event_end_dt_tm
, from_unixtime(CAST(ce.valid_from_dt_tm / 1000 as bigint)) AS valid_from
, from_unixtime(CAST(enc.arrive_dt_tm/1000 as bigint)) AS enc_arrive
, enc.arrive_dt_tm
, COALESCE(tci.checkin_dt_tm, enc.arrive_dt_tm) AS check_in_time
, from_unixtime(CAST(COALESCE(tci.checkin_dt_tm
, enc.arrive_dt_tm)/1000 as bigint)) AS check_in
, from_unixtime(CAST(enc.depart_dt_tm/1000 as bigint)) AS enc_depart
, enc.depart_dt_tm
FROM clinical_event ce
INNER JOIN encounter enc
ON ce.encntr_id = enc.encntr_id
LEFT OUTER JOIN (
SELECT
ti.encntr_id AS encntr_id
, MIN(tc.checkin_dt_tm) AS checkin_dt_tm
FROM tracking_item ti
JOIN tracking_checkin tc ON ti.tracking_id = tc.tracking_id
GROUP BY ti.encntr_id
) tci
ON tci.encntr_id = enc.encntr_id
WHERE ce.event_cd = '54411998' AND ce.encntr_id='{0}'
AND ce.valid_until_dt_tm>4e12;
""".format(enc)
cur.execute(query)
dfenc = as_pandas(cur)
dfenc['dep-perf_hrs'] = ((dfenc['depart_dt_tm']-dfenc['event_end_dt_tm'])/3600000)
dfenc['perf-arr_hrs'] = ((dfenc['event_end_dt_tm']-dfenc['check_in_time'])/3600000)
# if there's a problem, print the dfenc
if any(dfenc['dep-perf_hrs']<0):
print dfenc
if any(dfenc['perf-arr_hrs']<0):
print dfenc
RRTtimes = sorted(dfenc['event_end_dt_tm'].get_values())
time_lastloop = 0
for RRTtime in RRTtimes:
if time_lastloop == 0:
time_lastloop = RRTtime
else:
if (RRTtime-time_lastloop)/3600000. < 1:
print dfenc
In [ ]:
# Several of these have elapsed intervals under 1 hour. ==> Very unlikely.
# Sometimes, info can get entered twice into the system, esp if there is a shift change.
In [ ]:
dfenc